R Markdown

Part 0 - data organization and exploration

library(prettydoc)
library(arsenal)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(here)
## here() starts at /Users/stephcopeland/Documents/GitHub/222 Final Project/last take
library(tidyr)
library(DescTools)
## 
## Attaching package: 'DescTools'
## The following objects are masked from 'package:arsenal':
## 
##     %nin%, N
library(tibble)
library(calecopal)
library(ggeffects)
library(gt)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(viridis)
## Loading required package: viridisLite
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(modelr)
## 
## Attaching package: 'modelr'
## The following object is masked from 'package:ggeffects':
## 
##     data_grid
library(gtsummary)
# Read the country-level data set.
data1 <- read.csv("last_take.csv")

# Build per-country rates:
#   prop_dalys      - DALYs per person
#   area_ha         - country area converted from sq km to hectares (1 sq km = 100 ha)
#   tree_cover_loss - hectares of tree cover lost per hectare of country area
# The raw count columns are dropped once the rates exist. The previous
# intermediate `prop_tree_loss` (ha divided by sq km) mixed units and was
# discarded immediately, so it is no longer computed.
data2 <- data1 %>%
  mutate(
    prop_dalys = dalys / population,
    area_ha = area_sqkm * 100,
    tree_cover_loss = tree_cover_loss_ha / area_ha
  ) %>%
  select(-tree_cover_loss_ha, -dalys)

# Normal Q-Q plot of the untransformed DALY rate.
ggplot(data2, aes(sample = prop_dalys)) +
  geom_qq() +
  geom_qq_line()
## Warning: Removed 15 rows containing non-finite values (stat_qq).
## Warning: Removed 15 rows containing non-finite values (stat_qq_line).

# Add the natural log of the DALY rate.
data3 <- mutate(data2, logdalys = log(prop_dalys))

# Normal Q-Q plot of the log-transformed DALY rate.
ggplot(data = data3, mapping = aes(sample = logdalys)) +
  geom_qq() +
  geom_qq_line()
## Warning: Removed 15 rows containing non-finite values (stat_qq).
## Warning: Removed 15 rows containing non-finite values (stat_qq_line).

# Add the natural log of GDP per capita.
data4 <- mutate(data3, log_GDP = log(GDP_capita))

# Log DALY rate against the raw forest-loss proportion, with a straight-line fit.
ggplot(data = data4, mapping = aes(x = tree_cover_loss, y = logdalys)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).

# Winsorize the forest-loss proportion at the top only: values above the 95th
# percentile (quantile type 9, NAs ignored when computing the cut-off) are
# pulled down to that percentile. The lower bound was the 0th percentile, i.e.
# the minimum, so nothing is changed at the bottom and NAs stay NA.
# Written with base R so the result does not depend on which argument
# interface the installed DescTools::Winsorize() exposes.
data4 <- data4 %>%
  mutate(
    win_prop_tree = pmin(
      tree_cover_loss,
      quantile(tree_cover_loss, probs = 0.95, na.rm = TRUE, type = 9, names = FALSE)
    )
  )

Part 1 - Global impacts of deforestation on NTD DALY burden

# Simple regression: log DALY rate on winsorized forest-loss proportion, all countries.
mod1 <- lm(formula = logdalys ~ win_prop_tree, data = data4)
summary(mod1)
## 
## Call:
## lm(formula = logdalys ~ win_prop_tree, data = data4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.0167 -1.2604 -0.0954  1.6650  4.9776 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -6.9981     0.2408 -29.066   <2e-16 ***
## win_prop_tree  71.5909    50.2008   1.426    0.156    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.561 on 172 degrees of freedom
##   (15 observations deleted due to missingness)
## Multiple R-squared:  0.01169,    Adjusted R-squared:  0.00594 
## F-statistic: 2.034 on 1 and 172 DF,  p-value: 0.1557
# Scatter of log DALY rate against winsorized forest loss with the fitted
# regression line and its confidence band.
graph1 <- ggplot(data = data4, mapping = aes(x = win_prop_tree, y = logdalys)) +
  geom_point(size = 3, color = 'darkblue') +
  stat_smooth(method = "lm", se = TRUE, color = "red", fill = "#69b3a2") +
  labs(
    title = "Effect of Forest Loss on Disability Adjusted Life Years (DALYs) by country",
    x = "Proportion of Forest Loss (ha)",
    y = "Proportion of DALYs (log)"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, vjust = 0.5))

graph1
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).

# Continuous calecopal palettes interpolated to 15 colours each.
fire <- cal_palette("fire", n = 15, type = "continuous")
superbloom <- cal_palette("superbloom3", n = 15, type = "continuous")
kelp <- cal_palette("kelp1", n = 15, type = "continuous")

# Two-colour variant of the kelp palette.
kelp1 <- cal_palette("kelp1", n = 2, type = "continuous")
# Same relationship as graph1, with points shaded by log GDP per capita.
graph2 <- ggplot(data4, aes(y = logdalys, x = win_prop_tree, color = log_GDP)) +
  geom_point(size = 3) +
  stat_smooth(method = "lm", color = "red", fill = "#69b3a2", se = TRUE) +
  labs(x = "Proportion of Forest Loss (ha)", y = "Proportion of DALYs (log)") +
  ggtitle("Effect of Forest Loss on Disability Adjusted Life Years (DALYs) by Country") +
  theme_bw()

graph2 +
  # `guides(<scale> = FALSE)` is deprecated; "none" is the supported value.
  guides(size = "none") +
  # The colour aesthetic is log_GDP, so the legend states the log scale.
  labs(colour = "GDP per Capita (log)") +
  theme(legend.position = "bottom") +
  theme(plot.title = element_text(hjust = 0.5, vjust = 0.5)) +
  scale_color_gradientn(colours = kelp)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).

Hypothesis Testing and OLS violations Does deforestation contribute to the incidence and burden of neglected tropical diseases within countries?

H0: There is no effect of deforestation on NTD burden (B1 = 0). HA: There is a positive association between deforestation and NTD burden: a country with high forest loss will have a high NTD burden, whereas a country with low forest loss will have a low NTD burden (B1 ≠ 0).

Predictions: I expect to fail to reject the null; this particular data set is unlikely to capture the intricacies of NTD prevalence and estimation in relation to deforestation.

# Re-print the mod1 fit so the estimates quoted in the text can be read off it.
summary(mod1)
## 
## Call:
## lm(formula = logdalys ~ win_prop_tree, data = data4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.0167 -1.2604 -0.0954  1.6650  4.9776 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -6.9981     0.2408 -29.066   <2e-16 ***
## win_prop_tree  71.5909    50.2008   1.426    0.156    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.561 on 172 degrees of freedom
##   (15 observations deleted due to missingness)
## Multiple R-squared:  0.01169,    Adjusted R-squared:  0.00594 
## F-statistic: 2.034 on 1 and 172 DF,  p-value: 0.1557

Point Estimates: B0 = -6.9981 B1 = 71.5909

Std. Error: 50.2008

t-statistic: 1.426

p-value: 0.156

R-squared: 0.01169

OLS Assumptions - unbiased and lowest variance correlation: I’m not sure if you really need this with the regression, it’s just another way of showing that this relationship is “insignificant”

# Pearson correlation between log DALY rate and winsorized forest loss,
# using only rows where both values are present.
summarise(
  data4,
  dalys_cor = cor(logdalys, win_prop_tree, use = "complete.obs")
)
##   dalys_cor
## 1 0.1081012

0.108 (a weak, close-to-zero correlation)

  1. ‘yes’ to an extent obviously hugely variable
  2. Can not test but has likely in this case been violated (diseases are hardly ever explained by just one variable). Go more into this in part 2
  3. Yes
  4. Mean residuals (4.200678e-15)
# Attach mod1 fitted values (column `pred`) and compute residuals by hand.
predictions <- mutate(
  add_predictions(data4, mod1),
  residuals = logdalys - pred
)

# Mean residual, ignoring rows with missing values.
mean(predictions[["residuals"]], na.rm = TRUE)
## [1] 4.200678e-15
# Distribution of the mod1 residuals. The histogram is drawn on the density
# scale so that it shares a y axis with the kernel-density overlay; on the
# default count scale the density curve is squashed against the x axis.
ggplot(predictions, aes(residuals)) +
  geom_histogram(
    aes(y = after_stat(density)),
    color = "dark blue", fill = "dark blue", bins = 80
  ) +
  geom_density(alpha = 0.2, fill = "lightblue") +
  theme_bw()
## Warning: Removed 15 rows containing non-finite values (stat_bin).
## Warning: Removed 15 rows containing non-finite values (stat_density).

# mod1 residuals against the predictor, to eyeball the spread of the errors.
ggplot(data = predictions, mapping = aes(x = win_prop_tree, y = residuals)) +
  geom_point(size = 3, alpha = .5, color = "dark blue") +
  theme_bw()
## Warning: Removed 15 rows containing missing values (geom_point).

Part 2 - Impact of deforestation AND GDP per capita on country NTD DALY burden

# Forest loss, GDP per capita and their interaction as predictors of log DALY rate.
mod2 <- lm(formula = logdalys ~ win_prop_tree * GDP_capita, data = data4)
summary(mod2)
## 
## Call:
## lm(formula = logdalys ~ win_prop_tree * GDP_capita, data = data4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1610 -1.2681  0.0039  1.1068  5.2214 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -5.960e+00  2.289e-01 -26.034  < 2e-16 ***
## win_prop_tree             1.094e+02  5.146e+01   2.125    0.035 *  
## GDP_capita               -7.401e-05  9.474e-06  -7.812 5.68e-13 ***
## win_prop_tree:GDP_capita -4.193e-03  2.700e-03  -1.553    0.122    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.005 on 169 degrees of freedom
##   (16 observations deleted due to missingness)
## Multiple R-squared:  0.4041, Adjusted R-squared:  0.3936 
## F-statistic: 38.21 on 3 and 169 DF,  p-value: < 2.2e-16
# Log DALY rate against log GDP per capita, points shaded by log GDP per capita.
graph3 <- ggplot(data4, aes(y = logdalys, x = log_GDP, color = log_GDP)) +
  geom_point(size = 3) +
  stat_smooth(method = "lm", color = "blue", fill = "#69b3a2", se = TRUE) +
  # The x aesthetic is log_GDP, so the axis label states the log scale.
  labs(x = "GDP per capita (log)", y = "Proportion of DALYs (log)") +
  ggtitle("Effect of Economic Wealth on Disability Adjusted Life Years (DALYs) by country") +
  theme_bw()

graph3 +
  # `guides(<scale> = FALSE)` is deprecated; "none" is the supported value.
  guides(size = "none") +
  labs(colour = "GDP per Capita (log)") +
  theme(legend.position = "bottom") +
  theme(plot.title = element_text(hjust = 0.5, vjust = 0.5)) +
  scale_color_gradientn(colours = kelp)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 16 rows containing non-finite values (stat_smooth).
## Warning: Removed 16 rows containing missing values (geom_point).

# Log DALY rate against log GDP per capita, points shaded by winsorized forest loss.
graph4 <- ggplot(data4, aes(y = logdalys, x = log_GDP, color = win_prop_tree)) +
  geom_point(size = 3) +
  stat_smooth(method = "lm", color = "blue", fill = "#69b3a2", se = TRUE) +
  # The x aesthetic is log_GDP, so the axis label states the log scale.
  labs(x = "GDP per capita (log)", y = "Proportion of DALYs (log)") +
  ggtitle("Effect of Economic Wealth on Disability Adjusted Life Years (DALYs) by country") +
  theme_bw()

graph4 +
  # `guides(<scale> = FALSE)` is deprecated; "none" is the supported value.
  guides(size = "none") +
  labs(colour = "Proportion of Forest Loss") +
  theme(legend.position = "bottom") +
  theme(plot.title = element_text(hjust = 0.5, vjust = 0.5)) +
  scale_color_gradientn(colours = superbloom)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 16 rows containing non-finite values (stat_smooth).
## Warning: Removed 16 rows containing missing values (geom_point).

Hypothesis Testing and OLS violations - adding income to model Does deforestation affect the incidence and burden of neglected tropical diseases within countries when the model accounts for the effect of country economic wealth?

H0: There is no effect of deforestation on NTD burden (B1 = 0). HA: There is a positive association between deforestation and NTD burden: low-income, tropical countries with high deforestation rates will have a high incidence of DALYs relative to their populations (B1 ≠ 0).

Predictions: I expect to fail to reject the null, even this more focused subset of the data set is unlikely to capture the intricacies of NTD prevalence and estimation in relation to deforestation.

# Re-print the mod2 fit so the estimates quoted in the text can be read off it.
summary(mod2)
## 
## Call:
## lm(formula = logdalys ~ win_prop_tree * GDP_capita, data = data4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1610 -1.2681  0.0039  1.1068  5.2214 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -5.960e+00  2.289e-01 -26.034  < 2e-16 ***
## win_prop_tree             1.094e+02  5.146e+01   2.125    0.035 *  
## GDP_capita               -7.401e-05  9.474e-06  -7.812 5.68e-13 ***
## win_prop_tree:GDP_capita -4.193e-03  2.700e-03  -1.553    0.122    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.005 on 169 degrees of freedom
##   (16 observations deleted due to missingness)
## Multiple R-squared:  0.4041, Adjusted R-squared:  0.3936 
## F-statistic: 38.21 on 3 and 169 DF,  p-value: < 2.2e-16

Point Estimates: B0 = -5.96 B1 = 109.4

Std. Error: 5.146e+01

t-statistic: 2.125

p-value: 0.035 *

Adjusted R-squared: 0.3936 (multiple R-squared: 0.4041)

OLS Assumptions - unbiased and lowest variance

  1. ‘yes’ to an extent obviously hugely variable
  2. Can not test but has likely in this case been violated (diseases are hardly ever explained by just a couple). Go more into this…
  3. Yes
  4. Mean residuals: -3.973686e-15 (so basically zero, satisfying unbiasedness), BUT looking at the geom_point graph, the error terms are tightly clumped and their spread is not homogeneous, so there might be bias there.
# Attach mod2 fitted values (column `pred`) and compute residuals by hand.
predictions2 <- data4 %>% add_predictions(mod2) %>%
  mutate(residuals = logdalys - pred)

#ggplot(data=predictions2) + geom_histogram(aes(residuals), bins=80)

# Distribution of the mod2 residuals. The histogram is drawn on the density
# scale so that it shares a y axis with the kernel-density overlay; on the
# default count scale the density curve is squashed against the x axis.
ggplot(predictions2, aes(residuals)) +
  geom_histogram(
    aes(y = after_stat(density)),
    color = "dark blue", fill = "dark blue", bins = 80
  ) +
  geom_density(alpha = 0.2, fill = "lightblue") +
  theme_bw()
## Warning: Removed 16 rows containing non-finite values (stat_bin).
## Warning: Removed 16 rows containing non-finite values (stat_density).

# Mean of the mod2 residuals, ignoring rows with missing values.
mean(predictions2$residuals, na.rm = TRUE)
## [1] -3.973686e-15
#ggplot(predictions) + geom_point(aes(x=GDP_capita, y=residuals))

# mod2 residuals against forest loss, to eyeball the spread of the errors.
ggplot(data = predictions2, mapping = aes(x = win_prop_tree, y = residuals)) +
  geom_point(size = 3, alpha = .5, color = "dark blue") +
  theme_bw()
## Warning: Removed 16 rows containing missing values (geom_point).

Part 3 - Impact of deforestation and income in tropical nations designated “low income”

# Bucket countries into income tiers by GDP per capita. Branches are tried in
# order, so a value of exactly 4000 is caught by the "middle" branch before the
# "low" branch is reached; rows with missing GDP match nothing and stay NA.
data5 <- mutate(
  data4,
  country_income_level = case_when(
    GDP_capita >= 13000 ~ "high",
    GDP_capita >= 4000 ~ "middle",
    GDP_capita <= 4000 ~ "low"
  )
)

# Number of countries in each tier.
data5 %>%
  group_by(country_income_level) %>%
  summarise(n = n())
## # A tibble: 4 × 2
##   country_income_level     n
##   <chr>                <int>
## 1 high                    52
## 2 low                     72
## 3 middle                  51
## 4 <NA>                    14
# Names of low-income countries treated as equatorial/tropical.
# NOTE(review): this vector is not referenced anywhere else in the visible
# code; the subset below is built by excluding `noneqLOWcountries` instead.
# NOTE(review): "Bhangladesh" and "Phillipines" look misspelled and would not
# match standard country names if this vector is ever used for filtering.
equLOWcountries <- c("Burundi", "Benin", "Bhangladesh", "Bolivia", "Central African Republic", "Cote d'Ivoire", "Cameroon", "Democratic Republic of Congo", "Congo", "Comoros", "Ethiopia", "Ghana", "Guinea", "Gambia", "Honduras", "Haiti", "Indonesia", "Kenya", "Cambodia", "Laos", "Liberia", "Madagascar", "Mali", "Myanmar", "Mozambique",
"Mauritania", "Malawi", "Nigeria", "Nicaragua", "Phillipines", "Rwanda", "Senegal", "Sierra Leone", "El Salvador", "Eswatini", "Togo", "Tanzania", "Uganda", "Vietnam", "Zambia", "Zimbabwe")
# Three-letter codes of low-income countries to exclude as non-equatorial;
# matched against the `code` column when the low-income subset is built.
# NOTE(review): these appear to be ISO alpha-3 codes; confirm against the data.
noneqLOWcountries <- c("AFG", "ARM", "BTN", "CPV", "EGY", "ERI", "FSM", "GNB", "IND", "KGZ", "LSO", "MAR", "MDA", "MNG", "NER", "NPL", "PAK", "PNG", "SDN", "SLB", "SSD", "STP", "SYR", "TCD", "TJK", "TLS", "TUN", "UKR", "UZB", "VUT")
# Keep low-income countries whose code is not on the non-equatorial exclusion list.
data6 <- filter(
  data5,
  country_income_level == "low",
  !(code %in% noneqLOWcountries)
)

# Simple regression of log DALY rate on forest loss within that subset.
mod3 <- lm(formula = logdalys ~ win_prop_tree, data = data6)
summary(mod3)
## 
## Call:
## lm(formula = logdalys ~ win_prop_tree, data = data6)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.499 -1.321  0.180  1.315  2.359 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -3.9986     0.3069 -13.030  5.6e-16 ***
## win_prop_tree -30.4394    47.7265  -0.638    0.527    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.417 on 40 degrees of freedom
## Multiple R-squared:  0.01007,    Adjusted R-squared:  -0.01468 
## F-statistic: 0.4068 on 1 and 40 DF,  p-value: 0.5272
# Low-income tropical subset: log DALY rate against winsorized forest loss,
# points shaded by log GDP per capita.
graph5 <- ggplot(data6, aes(y = logdalys, x = win_prop_tree, color = log_GDP)) +
  geom_point(size = 4) +
  stat_smooth(method = "lm", color = "red", fill = "#69b3a2", se = TRUE) +
  labs(x = "Proportion of Forest Loss (ha)", y = "Proportion of DALYs (log)") +
  ggtitle("Effect of Forest Loss on Disability Adjusted Life Years (DALYs) \n for Low Income, Tropical Nations") +
  theme_bw()

graph5 +
  # `guides(<scale> = FALSE)` is deprecated; "none" is the supported value.
  guides(size = "none") +
  # The colour aesthetic is log_GDP, so the legend states the log scale.
  labs(colour = "GDP per Capita (log)") +
  theme(legend.position = "bottom") +
  theme(plot.title = element_text(hjust = 0.5, vjust = 0.5)) +
  scale_color_gradientn(colours = kelp)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## `geom_smooth()` using formula 'y ~ x'

# Low-income tropical subset: log DALY rate against GDP per capita (raw scale),
# points shaded by winsorized forest loss.
graph6 <- ggplot(data6, aes(y = logdalys, x = GDP_capita, color = win_prop_tree)) +
  geom_point(size = 4) +
  stat_smooth(method = "lm", color = "red", fill = "#69b3a2", se = TRUE) +
  labs(x = "GDP per capita", y = "Proportion of DALYs (log)") +
  ggtitle("Effect of Economic Wealth on Disability Adjusted Life Years (DALYs) \n for Low Income, Tropical Nations") +
  theme_bw()

graph6 +
  # `guides(<scale> = FALSE)` is deprecated; "none" is the supported value.
  guides(size = "none") +
  labs(colour = "Proportion of Forest Loss") +
  theme(legend.position = "bottom") +
  theme(plot.title = element_text(hjust = 0.5, vjust = 0.5)) +
  scale_color_gradientn(colours = superbloom)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## `geom_smooth()` using formula 'y ~ x'

# Interaction model (forest loss x GDP per capita) on the low-income tropical subset.
mod4 <- lm(formula = logdalys ~ win_prop_tree * GDP_capita, data = data6)
summary(mod4)
## 
## Call:
## lm(formula = logdalys ~ win_prop_tree * GDP_capita, data = data6)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.90404 -0.93667  0.00806  0.77030  1.99482 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -3.215e+00  4.262e-01  -7.543 4.59e-09 ***
## win_prop_tree             1.014e+02  7.204e+01   1.408   0.1674    
## GDP_capita               -5.546e-04  2.762e-04  -2.008   0.0518 .  
## win_prop_tree:GDP_capita -7.232e-02  4.452e-02  -1.625   0.1125    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.075 on 38 degrees of freedom
## Multiple R-squared:  0.4589, Adjusted R-squared:  0.4162 
## F-statistic: 10.74 on 3 and 38 DF,  p-value: 2.989e-05
# Bubble chart for the low-income tropical subset: GDP per capita against log
# DALY rate, bubble area scaled by winsorized forest loss and filled by GDP.
data6 %>%
  # Bubble size is forest loss, so sort on it: the largest bubbles are drawn
  # first and the smaller ones end up on top instead of hidden underneath.
  arrange(desc(win_prop_tree)) %>%
  mutate(country = factor(country, country)) %>%
  ggplot(aes(x = GDP_capita, y = logdalys, size = win_prop_tree, fill = GDP_capita)) +
    stat_smooth(method = "lm", color = "black", se = FALSE) +
    geom_point(alpha = 0.5, shape = 21, color = "black") +
    # Name the size scale after the variable it actually encodes.
    scale_size(range = c(1.4, 19), name = "Proportion of Forest Loss") +
    # `guide = FALSE` is deprecated; "none" is the supported value.
    scale_fill_viridis(discrete = FALSE, guide = "none", option = "A") +
    theme_bw() +
    ylab("DALYs (log)") +
    xlab("Gdp per Capita") +
    # An earlier legend.position = "bottom" was overridden by this setting,
    # so only the effective one is kept.
    theme(legend.position = "none")
## `geom_smooth()` using formula 'y ~ x'

# Interactive version of the bubble chart: the same axes, with a hover tooltip
# built from each country's values.
p <- data6 %>%
  # Sort by bubble size so the largest bubbles are drawn first and smaller
  # ones stay visible on top of them.
  arrange(desc(win_prop_tree)) %>%
  mutate(country = factor(country, country)) %>%
  mutate(
    # Percentages rounded to two decimals for display.
    per_dalys = round(prop_dalys * 100, 2),
    per_tree = round(win_prop_tree * 100, 2),
    GDP_capita = round(GDP_capita, 2),
    # Tooltip text shown by plotly on hover.
    text = paste0(
      "Country: ", country,
      "\nPopulation (M): ", population,
      "\n% DALYs of Population: ", per_dalys,
      "\nGdp per capita: ", GDP_capita,
      "\n % Tree Loss of Country: ", per_tree
    )
  ) %>%

  ggplot(aes(x = GDP_capita, y = logdalys, size = per_tree, color = GDP_capita, text = text)) +
    geom_point(alpha = 0.7) +
    scale_size(range = c(1.4, 19), name = "% Tree Loss") +
    # `guide = FALSE` is deprecated; "none" is the supported value.
    scale_color_viridis(discrete = FALSE, guide = "none", option = "A") +
    theme_bw() +
    theme(legend.position = "none")

# turn ggplot interactive with plotly
pp <- ggplotly(p, tooltip = "text")
pp

Hypothesis Testing and OLS Assumptions for Low Income & Tropical Countries Does deforestation affect the incidence and burden of neglected tropical diseases within countries that are geographically tropical and disadvantaged on the global economic spectrum?

H0: There is no effect of deforestation on NTD burden (B1 = 0). HA: There is a positive association between deforestation and NTD burden: low-income, tropical countries with high deforestation rates will have a high incidence of DALYs relative to their populations (B1 ≠ 0).

Predictions: I expect to fail to reject the null, even this more focused subset of the data set is unlikely to capture the intricacies of NTD prevalence and estimation in relation to deforestation.

# Re-print the mod4 fit so the estimates quoted in the text can be read off it.
summary(mod4)
## 
## Call:
## lm(formula = logdalys ~ win_prop_tree * GDP_capita, data = data6)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.90404 -0.93667  0.00806  0.77030  1.99482 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -3.215e+00  4.262e-01  -7.543 4.59e-09 ***
## win_prop_tree             1.014e+02  7.204e+01   1.408   0.1674    
## GDP_capita               -5.546e-04  2.762e-04  -2.008   0.0518 .  
## win_prop_tree:GDP_capita -7.232e-02  4.452e-02  -1.625   0.1125    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.075 on 38 degrees of freedom
## Multiple R-squared:  0.4589, Adjusted R-squared:  0.4162 
## F-statistic: 10.74 on 3 and 38 DF,  p-value: 2.989e-05

Point Estimates: B0 = -3.2146577 B1 = 101.40100

Std. Error: 72.03759

t-statistic: 1.408

p-value: 0.1674

Adjusted R-squared: 0.4162 (multiple R-squared: 0.4589)

OLS Assumptions - unbiased and lowest variance 1. ‘yes’ to an extent obviously hugely variable 2. Can not test but has likely in this case been violated (diseases are hardly ever explained by just a couple of variables). Go more into this… 3. Yes 4. Mean residuals: -3.172066e-16; this is the closest to zero, so this is the least biased of the three models

# Attach mod4 fitted values (column `pred`) and compute residuals by hand.
predictions4 <- data6 %>% add_predictions(mod4) %>%
  mutate(residuals = logdalys - pred)

#ggplot(data=predictions2) + geom_histogram(aes(residuals), bins=80)

# Distribution of the mod4 residuals. The histogram is drawn on the density
# scale so that it shares a y axis with the kernel-density overlay; on the
# default count scale the density curve is squashed against the x axis.
ggplot(predictions4, aes(residuals)) +
  geom_histogram(
    aes(y = after_stat(density)),
    color = "dark blue", fill = "dark blue", bins = 80
  ) +
  geom_density(alpha = 0.2, fill = "lightblue") +
  theme_bw()

# Mean of the mod4 residuals, ignoring rows with missing values.
mean(predictions4$residuals, na.rm = TRUE)
## [1] -3.172066e-16
#ggplot(predictions) + geom_point(aes(x=GDP_capita, y=residuals))

# mod4 residuals against forest loss, to eyeball the spread of the errors.
ggplot(data = predictions4, mapping = aes(x = win_prop_tree, y = residuals)) +
  geom_point(size = 3, alpha = .5, color = "dark blue") +
  theme_bw()